##Importing Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(shiny)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(leaps)
library(knitr)
library(ggplot2)
library("reshape2")
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(class)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(tree)
library(rpart)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:rattle':
## 
##     importance
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(readxl)
library(moments)
library(FactoMineR)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following object is masked from 'package:bitops':
## 
##     %&%
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
library(corrplot)
## corrplot 0.92 loaded
library(rsample)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode

Importing the cleaned dataset

# Load the cleaned dataset (CSV) and the raw workbook (Excel; presumably the
# pre-cleaning source - confirm).
# NOTE(review): both paths are absolute and specific to one machine.
dataset_s1 <- read.csv(
  file = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/dataset_s1_cleaned.csv"
)
dataset_before_cleaning_s1 <- read_xlsx(
  path = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/S1 File.xlsx"
)

Checking for Null values

# Count missing (TRUE) versus non-missing (FALSE) cells across all of dataset_s1.
table(is.na(dataset_s1))
## 
## FALSE 
## 31239

No missing (NA) values were found: all 31,239 cells are non-missing.

# Two rows hold the placeholder "-" instead of a crime count, which prevents
# The.number.of.crime.incidents from being numeric. Blank those entries, drop
# the affected rows, then convert the column (and every other column) to numeric.
dataset_s1$The.number.of.crime.incidents[
  which(dataset_s1$The.number.of.crime.incidents == "-")
] <- NA
dataset_s1 <- na.omit(dataset_s1)
dataset_s1$The.number.of.crime.incidents <-
  as.numeric(dataset_s1$The.number.of.crime.incidents)

dataset_s1 <- dataset_s1 %>% mutate_all(as.numeric)

# Same treatment for the raw workbook: replace the "-" placeholder in the crime
# count with NA, drop every row that contains an NA, and make the column numeric.
dataset_before_cleaning_s1$`The number of crime incidents`[
  which(dataset_before_cleaning_s1$`The number of crime incidents` == "-")
] <- NA
dataset_before_cleaning_s1 <- na.omit(dataset_before_cleaning_s1)
dataset_before_cleaning_s1$`The number of crime incidents` <-
  as.numeric(dataset_before_cleaning_s1$`The number of crime incidents`)

Structure of the dataset

# Print the type and leading values of every column in dataset_s1.
str(dataset_s1)
## 'data.frame':    799 obs. of  39 variables:
##  $ Census.tract                                                                                                : num  101 102 102 103 104 ...
##  $ The.number.of.crime.incidents                                                                               : num  564 452 458 360 225 192 110 110 367 175 ...
##  $ Total.population                                                                                            : num  4189 7083 2502 6213 4730 ...
##  $ Median.age                                                                                                  : num  34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
##  $ White                                                                                                       : num  2073 3198 1099 3429 3427 ...
##  $ Black.or.African.American                                                                                   : num  1687 3545 839 1806 660 ...
##  $ American.Indian.or.Alaska.Native                                                                            : num  0 16 6 5 0 0 0 0 4 0 ...
##  $ Asian                                                                                                       : num  162 148 248 741 395 100 638 326 106 200 ...
##  $ Native.Hawaiian.and.Other.Pacific.Islander                                                                  : num  0 0 18 0 0 0 0 0 0 0 ...
##  $ Some.other.race                                                                                             : num  93 41 208 148 93 17 246 12 87 412 ...
##  $ Two.or.more.races                                                                                           : num  174 135 84 84 155 194 88 96 319 114 ...
##  $ Hispanic.or.Latino                                                                                          : num  456 1571 658 964 454 ...
##  $ Not.Hispani.or.Latino                                                                                       : num  3733 5512 1844 5249 4276 ...
##  $ Total.housing.units                                                                                         : num  2614 2995 1236 3258 2178 ...
##  $ Vacant.housing.units                                                                                        : num  467 489 167 563 341 324 389 253 133 129 ...
##  $ Median.housing.value                                                                                        : num  191600 169300 165700 195000 221700 ...
##  $ Percent.of.less.than.9th.grade                                                                              : num  1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
##  $ Percent.of.9th.to.12th.grade                                                                                : num  2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
##  $ Percent.of.high.school.graduate                                                                             : num  21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
##  $ Percent.of.some.college                                                                                     : num  31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
##  $ Percent.of.associate.s.degree                                                                               : num  3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
##  $ Percent.of.bachelor.s.degree                                                                                : num  22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
##  $ Percent.of.graduate.or.professional.degree                                                                  : num  16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
##  $ Percent.of.employed.population                                                                              : num  63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
##  $ Percent.of.unemployed.population                                                                            : num  11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
##  $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining                                : num  0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
##  $ Percent.of.population.of.construction                                                                       : num  6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
##  $ Percent.of.population.of.manufacturing                                                                      : num  4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
##  $ Percent.of.population.of.wholesale.trade                                                                    : num  0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
##  $ Percent.of.population.of.retail.trade                                                                       : num  6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
##  $ Percent.of.population.of.transportation..warehousing..and.utilities                                         : num  5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
##  $ Percent.of.population.of.information                                                                        : num  0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
##  $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing                               : num  5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
##  $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num  14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
##  $ Percent.of.population.of.educational.services..health.care..and.social.assistance                           : num  36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
##  $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services                  : num  9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
##  $ Percent.of.population.of.public.administration                                                              : num  5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
##  $ Percent.of.population.of.other.services                                                                     : num  4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
##  $ Mean.income                                                                                                 : num  58908 68583 54897 83002 96641 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 800 801
##   ..- attr(*, "names")= chr [1:2] "800" "801"
# The crime-incident column was originally read as character; it was converted to numeric above, so all 39 columns are now numeric.

Statistical and Exploratory Data Analysis

# Keep a working copy of the raw tibble (it is the data source for the EDA
# plots below), then print summary statistics for each column of the cleaned data.
dataset_s1_copy <- dataset_before_cleaning_s1
summary(object = dataset_s1)
##   Census.tract  The.number.of.crime.incidents Total.population   Median.age   
##  Min.   : 101   Min.   :   3.0                Min.   :    0    Min.   :17.50  
##  1st Qu.:1608   1st Qu.: 154.5                1st Qu.: 2034    1st Qu.:30.20  
##  Median :3514   Median : 259.0                Median : 3128    Median :33.10  
##  Mean   :4039   Mean   : 334.3                Mean   : 3424    Mean   :34.23  
##  3rd Qu.:6702   3rd Qu.: 413.5                3rd Qu.: 4504    3rd Qu.:38.10  
##  Max.   :8439   Max.   :3217.0                Max.   :17582    Max.   :63.50  
##      White      Black.or.African.American American.Indian.or.Alaska.Native
##  Min.   :   0   Min.   :   0              Min.   :  0.000                 
##  1st Qu.: 118   1st Qu.:  78              1st Qu.:  0.000                 
##  Median :1439   Median : 350              Median :  0.000                 
##  Mean   :1663   Mean   :1087              Mean   :  9.083                 
##  3rd Qu.:2694   3rd Qu.:1658              3rd Qu.:  7.000                 
##  Max.   :8764   Max.   :7063              Max.   :219.000                 
##      Asian        Native.Hawaiian.and.Other.Pacific.Islander Some.other.race
##  Min.   :   0.0   Min.   : 0.0000                            Min.   :   0   
##  1st Qu.:   0.0   1st Qu.: 0.0000                            1st Qu.:  11   
##  Median :  46.0   Median : 0.0000                            Median :  92   
##  Mean   : 195.9   Mean   : 0.9099                            Mean   : 392   
##  3rd Qu.: 221.5   3rd Qu.: 0.0000                            3rd Qu.: 392   
##  Max.   :6691.0   Max.   :65.0000                            Max.   :4937   
##  Two.or.more.races Hispanic.or.Latino Not.Hispani.or.Latino Total.housing.units
##  Min.   :  0.00    Min.   :   0.0     Min.   :    0         Min.   :    0      
##  1st Qu.: 15.00    1st Qu.:  74.5     1st Qu.: 1288         1st Qu.:  888      
##  Median : 51.00    Median : 351.0     Median : 2133         Median : 1340      
##  Mean   : 75.97    Mean   : 988.2     Mean   : 2435         Mean   : 1502      
##  3rd Qu.:106.50    3rd Qu.:1333.0     3rd Qu.: 3296         3rd Qu.: 1886      
##  Max.   :750.00    Max.   :7256.0     Max.   :16353         Max.   :12190      
##  Vacant.housing.units Median.housing.value Percent.of.less.than.9th.grade
##  Min.   :   0.0       Min.   : 55700       Min.   : 0.000                
##  1st Qu.:  89.5       1st Qu.:146050       1st Qu.: 2.700                
##  Median : 161.0       Median :207900       Median : 5.800                
##  Mean   : 203.9       Mean   :239365       Mean   : 9.193                
##  3rd Qu.: 252.0       3rd Qu.:297750       3rd Qu.:12.150                
##  Max.   :2023.0       Max.   :814300       Max.   :45.200                
##  Percent.of.9th.to.12th.grade Percent.of.high.school.graduate
##  Min.   : 0.000               Min.   : 0.40                  
##  1st Qu.: 4.300               1st Qu.:16.00                  
##  Median : 9.100               Median :25.90                  
##  Mean   : 9.927               Mean   :24.27                  
##  3rd Qu.:14.300               3rd Qu.:33.10                  
##  Max.   :41.900               Max.   :56.80                  
##  Percent.of.some.college Percent.of.associate.s.degree
##  Min.   : 2.00           Min.   : 0.000               
##  1st Qu.:12.45           1st Qu.: 3.400               
##  Median :18.20           Median : 5.000               
##  Mean   :18.73           Mean   : 5.462               
##  3rd Qu.:24.60           3rd Qu.: 7.150               
##  Max.   :42.90           Max.   :19.000               
##  Percent.of.bachelor.s.degree Percent.of.graduate.or.professional.degree
##  Min.   : 0.00                Min.   : 0.00                             
##  1st Qu.: 7.60                1st Qu.: 3.20                             
##  Median :15.20                Median : 8.20                             
##  Mean   :19.51                Mean   :12.91                             
##  3rd Qu.:28.10                3rd Qu.:20.50                             
##  Max.   :66.20                Max.   :70.50                             
##  Percent.of.employed.population Percent.of.unemployed.population
##  Min.   : 0.00                  Min.   : 0.400                  
##  1st Qu.:43.80                  1st Qu.: 5.500                  
##  Median :56.70                  Median : 8.400                  
##  Mean   :55.74                  Mean   : 9.525                  
##  3rd Qu.:65.25                  3rd Qu.:12.350                  
##  Max.   :90.60                  Max.   :74.100                  
##  Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining
##  Min.   :0.0000                                                              
##  1st Qu.:0.0000                                                              
##  Median :0.0000                                                              
##  Mean   :0.1445                                                              
##  3rd Qu.:0.0000                                                              
##  Max.   :4.0000                                                              
##  Percent.of.population.of.construction Percent.of.population.of.manufacturing
##  Min.   : 0.000                        Min.   : 0.000                        
##  1st Qu.: 1.100                        1st Qu.: 4.400                        
##  Median : 3.000                        Median : 7.300                        
##  Mean   : 3.876                        Mean   : 9.026                        
##  3rd Qu.: 5.900                        3rd Qu.:12.100                        
##  Max.   :21.500                        Max.   :34.400                        
##  Percent.of.population.of.wholesale.trade Percent.of.population.of.retail.trade
##  Min.   : 0.00                            Min.   : 0.000                       
##  1st Qu.: 0.70                            1st Qu.: 6.500                       
##  Median : 1.80                            Median : 8.800                       
##  Mean   : 2.25                            Mean   : 9.346                       
##  3rd Qu.: 3.30                            3rd Qu.:11.800                       
##  Max.   :22.00                            Max.   :31.200                       
##  Percent.of.population.of.transportation..warehousing..and.utilities
##  Min.   : 0.000                                                     
##  1st Qu.: 3.100                                                     
##  Median : 5.600                                                     
##  Mean   : 6.597                                                     
##  3rd Qu.: 9.200                                                     
##  Max.   :33.700                                                     
##  Percent.of.population.of.information
##  Min.   : 0.000                      
##  1st Qu.: 0.700                      
##  Median : 1.900                      
##  Mean   : 2.261                      
##  3rd Qu.: 3.400                      
##  Max.   :12.700                      
##  Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing
##  Min.   : 0.000                                                               
##  1st Qu.: 3.900                                                               
##  Median : 6.500                                                               
##  Mean   : 7.346                                                               
##  3rd Qu.: 9.900                                                               
##  Max.   :26.500                                                               
##  Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services
##  Min.   : 0.00                                                                                               
##  1st Qu.: 9.00                                                                                               
##  Median :12.30                                                                                               
##  Mean   :13.93                                                                                               
##  3rd Qu.:17.20                                                                                               
##  Max.   :40.00                                                                                               
##  Percent.of.population.of.educational.services..health.care..and.social.assistance
##  Min.   : 3.20                                                                    
##  1st Qu.:17.50                                                                    
##  Median :23.00                                                                    
##  Mean   :24.17                                                                    
##  3rd Qu.:30.15                                                                    
##  Max.   :65.90                                                                    
##  Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services
##  Min.   : 0.00                                                                             
##  1st Qu.: 6.85                                                                             
##  Median :10.10                                                                             
##  Mean   :11.17                                                                             
##  3rd Qu.:14.40                                                                             
##  Max.   :40.10                                                                             
##  Percent.of.population.of.public.administration
##  Min.   : 0.00                                 
##  1st Qu.: 3.20                                 
##  Median : 5.00                                 
##  Mean   : 5.27                                 
##  3rd Qu.: 6.80                                 
##  Max.   :20.70                                 
##  Percent.of.population.of.other.services  Mean.income    
##  Min.   : 0.000                          Min.   :  4197  
##  1st Qu.: 1.700                          1st Qu.: 46270  
##  Median : 3.300                          Median : 62842  
##  Mean   : 4.616                          Mean   : 83423  
##  3rd Qu.: 6.350                          3rd Qu.: 97354  
##  Max.   :28.000                          Max.   :399454
# The summary statistics indicate whether the variables need scaling.

From the summary statistics, we can observe irregular (skewed) distributions in most of the variables, which would distort the fit of the model. These will be addressed with normalization and generalization methods before the model is fitted.

# EXPLORATORY DATA ANALYSIS

## Distribution of crime occurrences

# Density-scaled histogram of crime incidents per row of dataset_s1_copy, with a
# kernel density curve drawn on top.
# after_stat(density) replaces the `..density..` notation, which ggplot2 3.4.0
# deprecated; the deprecation warning is therefore no longer emitted.
ggplot(dataset_s1_copy, aes(x = `The number of crime incidents`)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 70,
    fill = "cornsilk",
    color = "black"
  ) +
  geom_density(adjust = 0.8, fill = "cyan", color = "black", alpha = 0.4) +
  labs(
    x = "Crime Occurrences",
    y = "Density",
    title = "Number of Crime Occurrences Distribution"
  )

Most census tracts have between 0 and 1,000 crime occurrences, so the categories used for the EDA are defined within that range.

Categorizing the crime occurrences into several categories for visualization purposes

# Bin the crime count into four labelled categories. Conditions are evaluated
# top to bottom, so each row gets the first label whose condition holds.
# NOTE(review): the first label reads "<200" but the bin includes exactly 200,
# and a count of exactly 800 falls into 'High Crime Rate'.
dataset_s1_copy <- dataset_s1_copy %>%
  mutate(
    crime_categories = case_when(
      is.na(`The number of crime incidents`) ~ NA_character_,
      `The number of crime incidents` <= 200 ~ "Less number of crimes (<200)",
      `The number of crime incidents` <= 500 ~ "200 to 500 crime rate",
      `The number of crime incidents` < 800 ~ "500 to 800 crime rate",
      TRUE ~ "High Crime Rate"
    )
  )

# Number of rows in each category.
table(dataset_s1_copy$crime_categories)
## 
##        200 to 500 crime rate        500 to 800 crime rate 
##                          346                          100 
##              High Crime Rate Less number of crimes (<200) 
##                           52                          301

Checking the distribution of some important variables to see how they relate to crime occurrences

Distribution of different races

White people

# Mean of the White column within each crime category, shown as one bar per
# category.
average_white <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarise(average_white = mean(White, na.rm = TRUE))

ggplot(
  data = average_white,
  mapping = aes(x = crime_categories, y = average_white, fill = crime_categories)
) +
  geom_col(color = "black") +
  scale_fill_manual(values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")) +
  labs(
    title = "Average White People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average White People",
    fill = "Crime Categories"
  )

Census tracts in the high-crime category have an average of around 800 White residents. Tracts with a larger White population tend to fall into the lower-crime categories.

Black or African American

# Mean of the `Black or African American` column within each crime category,
# shown as one bar per category. The summary is stored under its own name
# instead of overwriting `average_white`, and the labels use the readable
# group name rather than the dotted column name.
average_black <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarize(average_black = mean(`Black or African American`, na.rm = TRUE))

ggplot(average_black, aes(x = crime_categories, y = average_black, fill = crime_categories)) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Average Black or African American People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Black or African American People",
    fill = "Crime Categories"
  ) +
  scale_fill_manual(
    values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")
  )

Census tracts with larger Black or African American populations tend to fall into the higher-crime categories. The high-crime category has the highest bar in the chart, averaging around 3,000.

American.Indian.or.Alaska.Native

# Mean of the `American Indian or Alaska Native` column within each crime
# category, shown as one bar per category. The summary is stored under its own
# name instead of overwriting `average_white`; the axis label typo ("Nativen")
# and the dotted column name in the labels are corrected.
average_american_indian <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarize(
    average_american_indian = mean(`American Indian or Alaska Native`, na.rm = TRUE)
  )

ggplot(
  average_american_indian,
  aes(x = crime_categories, y = average_american_indian, fill = crime_categories)
) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Average American Indian or Alaska Native People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average American Indian or Alaska Native People",
    fill = "Crime Categories"
  ) +
  scale_fill_manual(
    values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")
  )

Census tracts in the high-crime category have, on average, about 10 American Indian or Alaska Native residents.

Asian

# Mean of the Asian column within each crime category, shown as one bar per
# category. The summary is stored under its own name instead of overwriting
# `average_white`.
average_asian <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarize(average_asian = mean(Asian, na.rm = TRUE))

ggplot(average_asian, aes(x = crime_categories, y = average_asian, fill = crime_categories)) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Average Asian People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Asian People",
    fill = "Crime Categories"
  ) +
  scale_fill_manual(
    values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")
  )

Tracts with larger Asian populations tend to fall into the lowest crime category: tracts with 200 crimes or fewer have an average of around 200 Asian residents, while tracts in the high-crime category have an average of approximately 170.

Native Hawaiian and Other Pacific Islander Race

# Mean of the `Native Hawaiian and Other Pacific Islander` column within each
# crime category, shown as one bar per category. The summary is stored under
# its own name instead of overwriting `average_white`, and the labels name the
# full group rather than "Hawaii People".
average_pacific_islander <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarize(
    average_pacific_islander = mean(
      `Native Hawaiian and Other Pacific Islander`,
      na.rm = TRUE
    )
  )

ggplot(
  average_pacific_islander,
  aes(x = crime_categories, y = average_pacific_islander, fill = crime_categories)
) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Average Native Hawaiian and Other Pacific Islander People - Crime Occurrences",
    x = "Crime Rates",
    y = "Average Native Hawaiian and Other Pacific Islander People",
    fill = "Crime Categories"
  ) +
  scale_fill_manual(
    values = c("#66FFFF", "#FF0033", "#FFCC00", "#0066CC")
  )

Census tracts in the high-crime category have, on average, about 2 Native Hawaiian and Other Pacific Islander residents, which is far lower than for the other groups.

How does population affect the Crime Rates - Male vs Female

# Male: mean of `Male population` per crime category, drawn as a pie chart
# (a single stacked bar wrapped onto polar coordinates) with the rounded mean
# printed in the middle of each slice.
average_male <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarise(average_male = mean(`Male population`, na.rm = TRUE))

plot_male <- ggplot(
  data = average_male,
  mapping = aes(x = "", y = average_male, fill = crime_categories)
) +
  geom_col(width = 1) +
  geom_text(
    mapping = aes(label = round(average_male, 2)),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Average Male Population - Crimes", fill = "Crime Rate", y = NULL) +
  theme(legend.position = "bottom")

# Female: mean of `Female population` per crime category, drawn the same way
# as the male chart (stacked bar on polar coordinates, slice labels centred).
average_female <- dataset_s1_copy %>%
  group_by(crime_categories) %>%
  summarise(average_female = mean(`Female population`, na.rm = TRUE))

plot_female <- ggplot(
  data = average_female,
  mapping = aes(x = "", y = average_female, fill = crime_categories)
) +
  geom_col(width = 1) +
  geom_text(
    mapping = aes(label = round(average_female, 2)),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Average Female Population - Crimes", fill = "Crime Rate", y = NULL) +
  theme(legend.position = "bottom")

# Show the male and female charts side by side.
grid.arrange(grobs = list(plot_male, plot_female), ncol = 2)

Census tracts in the high-crime category have an average of about 2,580 female residents and about 2,300 male residents.

Does Employment have an effect ?

# Density charts of the employed and unemployed population, one translucent
# curve per crime category, stacked vertically for comparison.
employed_population <- ggplot(
  data = dataset_s1_copy,
  mapping = aes(x = `Employed population`, fill = crime_categories)
) +
  geom_density(alpha = 0.3) +
  scale_fill_manual(values = c("red", "blue", "green", "orange")) +
  labs(
    title = "Density plot for Employed Population",
    x = "Employed Population",
    y = "Density"
  )

unemployed_population <- ggplot(
  data = dataset_s1_copy,
  mapping = aes(x = `Unemployed population`, fill = crime_categories)
) +
  geom_density(alpha = 0.3) +
  scale_fill_manual(values = c("red", "blue", "green", "orange")) +
  labs(
    title = "Density plot for UnEmployed Population",
    x = "UnEmployed Population",
    y = "Density"
  )

grid.arrange(grobs = list(employed_population, unemployed_population), ncol = 1)

As the unemployed population increases, we see higher crime rates. For the employed population, the crime rate is lower.

Areas with more unemployed people are more likely to have high crime than areas with more employed people.

PREDICTIVE MODELING

##Skewness Check

# Computing the skewness for all numerical variables.
# The table is built in one step instead of growing it with rbind() inside a
# loop; it has one row per numeric column of dataset_s1 and is empty (zero
# rows) when there are no numeric columns.
skewness_df <- data.frame(
  Variable_Name = names(dataset_s1)[vapply(dataset_s1, is.numeric, logical(1))],
  stringsAsFactors = FALSE
)
skewness_df$skewness <- vapply(
  skewness_df$Variable_Name,
  function(column) skewness(dataset_s1[[column]], na.rm = TRUE),
  numeric(1),
  USE.NAMES = FALSE
)
skewness_df

## Listing the variables having heavy skewness
# which() drops comparisons that evaluate to NA (for example the NaN skewness
# of a constant column); plain logical indexing would return all-NA rows.
# NOTE(review): the cut-offs are asymmetric (>= 3 on the right, <= -2 on the
# left) -- confirm this is intentional.
heavy_skewness <- skewness_df[
  which(skewness_df$skewness >= 3 | skewness_df$skewness <= -2),
]
heavy_skewness

Checking the structure

# Convert every character column to numeric and leave all other columns as
# they are.
# A scalar if/else is required here: is.character(x) yields a single
# TRUE/FALSE per column, and ifelse() returns a result with the shape of its
# test, so ifelse() would keep only the first value of each column and that
# single value would then be recycled down every row.
dataset_before_cleaning_s1[] <- lapply(
  dataset_before_cleaning_s1,
  function(x) if (is.character(x)) as.numeric(x) else x
)
str(dataset_before_cleaning_s1)
## tibble [799 × 79] (S3: tbl_df/tbl/data.frame)
##  $ Census tract                                                                                                : num [1:799] 101 101 101 101 101 101 101 101 101 101 ...
##  $ The number of crime incidents                                                                               : num [1:799] 564 564 564 564 564 564 564 564 564 564 ...
##  $ Total population                                                                                            : num [1:799] 4189 4189 4189 4189 4189 ...
##  $ Male population                                                                                             : num [1:799] 1742 1742 1742 1742 1742 ...
##  $ Percent of male population                                                                                  : num [1:799] 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 41.6 ...
##  $ Female population                                                                                           : num [1:799] 2447 2447 2447 2447 2447 ...
##  $ Percent of female population                                                                                : num [1:799] 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 58.4 ...
##  $ Median age                                                                                                  : num [1:799] 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 34.4 ...
##  $ Under 16 years                                                                                              : num [1:799] 789 789 789 789 789 789 789 789 789 789 ...
##  $ Percent of under 16 years                                                                                   : num [1:799] 18.8 18.8 18.8 18.8 18.8 ...
##  $ Over 65 years                                                                                               : num [1:799] 182 182 182 182 182 182 182 182 182 182 ...
##  $ Percent of over 65 years                                                                                    : num [1:799] 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 4.3 ...
##  $ White                                                                                                       : num [1:799] 2073 2073 2073 2073 2073 ...
##  $ Black or African American                                                                                   : num [1:799] 1687 1687 1687 1687 1687 ...
##  $ American Indian or Alaska Native                                                                            : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian                                                                                                       : num [1:799] 162 162 162 162 162 162 162 162 162 162 ...
##  $ Native Hawaiian and Other Pacific Islander                                                                  : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Some other race                                                                                             : num [1:799] 93 93 93 93 93 93 93 93 93 93 ...
##  $ Two or more races                                                                                           : num [1:799] 174 174 174 174 174 174 174 174 174 174 ...
##  $ Hispanic or Latino                                                                                          : num [1:799] 456 456 456 456 456 456 456 456 456 456 ...
##  $ Not Hispani or Latino                                                                                       : num [1:799] 3733 3733 3733 3733 3733 ...
##  $ Total housing units                                                                                         : num [1:799] 2614 2614 2614 2614 2614 ...
##  $ Occupied housing units                                                                                      : num [1:799] 2147 2147 2147 2147 2147 ...
##  $ Percent of occupied housing units                                                                           : num [1:799] 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 82.1 ...
##  $ Vacant housing units                                                                                        : num [1:799] 467 467 467 467 467 467 467 467 467 467 ...
##  $ Percent of vacant housing units                                                                             : num [1:799] 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 ...
##  $ Median housing value                                                                                        : num [1:799] 191600 191600 191600 191600 191600 ...
##  $ Over 25 years                                                                                               : num [1:799] 2949 2949 2949 2949 2949 ...
##  $ Percent of less than 9th grade                                                                              : num [1:799] 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 ...
##  $ Percent of 9th to 12th grade                                                                                : num [1:799] 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 2.6 ...
##  $ Percent of high school graduate                                                                             : num [1:799] 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5 ...
##  $ Percent of some college                                                                                     : num [1:799] 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 31.8 ...
##  $ Percent of associate’s degree                                                                               : num [1:799] 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 3.8 ...
##  $ Percent of bachelor’s degree                                                                                : num [1:799] 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 22.3 ...
##  $ Percent of graduate or professional degree                                                                  : num [1:799] 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 16.2 ...
##  $ Percent of high school graduate or higher                                                                   : num [1:799] 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 95.6 ...
##  $ Percent of bachelor’s degree of higher                                                                      : num [1:799] 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 38.6 ...
##  $ Percent of less than high school graduate                                                                   : num [1:799] 4.4 4.4 4.4 4.4 4.4 ...
##  $ Percent of less than bachelor’s degree                                                                      : num [1:799] 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 61.4 ...
##  $ Over 16 years                                                                                               : num [1:799] 3400 3400 3400 3400 3400 3400 3400 3400 3400 3400 ...
##  $ Percent of over 16 years                                                                                    : num [1:799] 81.2 81.2 81.2 81.2 81.2 ...
##  $ Population in labor force                                                                                   : num [1:799] 2546 2546 2546 2546 2546 ...
##  $ Percent of population in labor force                                                                        : num [1:799] 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 74.9 ...
##  $ Population not in labor force                                                                               : num [1:799] 854 854 854 854 854 854 854 854 854 854 ...
##  $ Percent of population not in labor force                                                                    : num [1:799] 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 25.1 ...
##  $ Employed population                                                                                         : num [1:799] 2161 2161 2161 2161 2161 ...
##  $ Percent of employed population                                                                              : num [1:799] 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 63.6 ...
##  $ Unemployed population                                                                                       : num [1:799] 385 385 385 385 385 385 385 385 385 385 ...
##  $ Percent of unemployed population                                                                            : num [1:799] 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 11.3 ...
##  $ Population not in labor force and unemployed population                                                     : num [1:799] 1239 1239 1239 1239 1239 ...
##  $ Percent of population not in labor force and unemployed population                                          : num [1:799] 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 36.4 ...
##  $ Population of agriculture, forestry, fishing, hunting, and mining                                           : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Percent of population of agriculture, forestry, fishing, hunting, and mining                                : num [1:799] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Population of construction                                                                                  : num [1:799] 135 135 135 135 135 135 135 135 135 135 ...
##  $ Percent of population of construction                                                                       : num [1:799] 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 ...
##  $ Population of manufacturing                                                                                 : num [1:799] 102 102 102 102 102 102 102 102 102 102 ...
##  $ Percent of population of manufacturing                                                                      : num [1:799] 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 ...
##  $ Population of wholesale trade                                                                               : num [1:799] 18 18 18 18 18 18 18 18 18 18 ...
##  $ Percent of population of wholesale trade                                                                    : num [1:799] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
##  $ Population of retail trade                                                                                  : num [1:799] 133 133 133 133 133 133 133 133 133 133 ...
##  $ Percent of population of retail trade                                                                       : num [1:799] 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 ...
##  $ Population of transportation, warehousing, and utilities                                                    : num [1:799] 125 125 125 125 125 125 125 125 125 125 ...
##  $ Percent of population of transportation, warehousing, and utilities                                         : num [1:799] 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 ...
##  $ Population of information                                                                                   : num [1:799] 13 13 13 13 13 13 13 13 13 13 ...
##  $ Percent of population of information                                                                        : num [1:799] 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 0.6 ...
##  $ Population of finance, insurance, real estate, rental, and leasing                                          : num [1:799] 126 126 126 126 126 126 126 126 126 126 ...
##  $ Percent of population of finance, insurance, real estate, rental, and leasing                               : num [1:799] 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 5.8 ...
##  $ Population of professional, scientific, management, administrative, and waste management services           : num [1:799] 304 304 304 304 304 304 304 304 304 304 ...
##  $ Percent of population of professional, scientific, management, administrative, and waste management services: num [1:799] 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 14.1 ...
##  $ Population of educational services, health care, and social assistance                                      : num [1:799] 791 791 791 791 791 791 791 791 791 791 ...
##  $ Percent of population of educational services, health care, and social assistance                           : num [1:799] 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 36.6 ...
##  $ Population of arts, entertainment, recreation, accommodation, and food services                             : num [1:799] 202 202 202 202 202 202 202 202 202 202 ...
##  $ Percent of population of arts, entertainment, recreation, accommodation, and food services                  : num [1:799] 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 9.3 ...
##  $ Population of public administration                                                                         : num [1:799] 111 111 111 111 111 111 111 111 111 111 ...
##  $ Percent of population of public administration                                                              : num [1:799] 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 5.1 ...
##  $ Population of other services                                                                                : num [1:799] 101 101 101 101 101 101 101 101 101 101 ...
##  $ Percent of population of other services                                                                     : num [1:799] 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 4.7 ...
##  $ Median income                                                                                               : num [1:799] 44826 44826 44826 44826 44826 ...
##  $ Mean income                                                                                                 : num [1:799] 58908 58908 58908 58908 58908 ...
##  - attr(*, "na.action")= 'omit' Named int [1:2] 800 801
##   ..- attr(*, "names")= chr [1:2] "800" "801"
# Load the cleaned crime dataset and take a first look at its rows and
# column types.
# NOTE(review): the path is absolute and machine-specific.
crime_data <- read.csv(
  file = "/Users/jasonrayen/Downloads/Jason Masters/Sem 3/AIT 664 (Information, Visualization & Representation)/Project/dataset_s1_cleaned2.csv"
)
head(crime_data)
str(crime_data)
## 'data.frame':    801 obs. of  39 variables:
##  $ Census.tract                                                                                                : num  101 102 102 103 104 ...
##  $ The.number.of.crime.incidents                                                                               : chr  "564" "452" "458" "360" ...
##  $ Total.population                                                                                            : int  4189 7083 2502 6213 4730 3831 3690 2228 6513 3946 ...
##  $ Median.age                                                                                                  : num  34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
##  $ White                                                                                                       : int  2073 3198 1099 3429 3427 2330 2123 1577 4087 2565 ...
##  $ Black.or.African.American                                                                                   : int  1687 3545 839 1806 660 1190 595 217 1910 655 ...
##  $ American.Indian.or.Alaska.Native                                                                            : int  0 16 6 5 0 0 0 0 4 0 ...
##  $ Asian                                                                                                       : int  162 148 248 741 395 100 638 326 106 200 ...
##  $ Native.Hawaiian.and.Other.Pacific.Islander                                                                  : int  0 0 18 0 0 0 0 0 0 0 ...
##  $ Some.other.race                                                                                             : int  93 41 208 148 93 17 246 12 87 412 ...
##  $ Two.or.more.races                                                                                           : int  174 135 84 84 155 194 88 96 319 114 ...
##  $ Hispanic.or.Latino                                                                                          : int  456 1571 658 964 454 191 578 374 1715 1654 ...
##  $ Not.Hispani.or.Latino                                                                                       : int  3733 5512 1844 5249 4276 3640 3112 1854 4798 2292 ...
##  $ Total.housing.units                                                                                         : int  2614 2995 1236 3258 2178 2559 1956 1343 3261 1658 ...
##  $ Vacant.housing.units                                                                                        : int  467 489 167 563 341 324 389 253 133 129 ...
##  $ Median.housing.value                                                                                        : num  191600 169300 165700 195000 221700 ...
##  $ Percent.of.less.than.9th.grade                                                                              : num  1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
##  $ Percent.of.9th.to.12th.grade                                                                                : num  2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
##  $ Percent.of.high.school.graduate                                                                             : num  21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
##  $ Percent.of.some.college                                                                                     : num  31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
##  $ Percent.of.associate.s.degree                                                                               : num  3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
##  $ Percent.of.bachelor.s.degree                                                                                : num  22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
##  $ Percent.of.graduate.or.professional.degree                                                                  : num  16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
##  $ Percent.of.employed.population                                                                              : num  63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
##  $ Percent.of.unemployed.population                                                                            : num  11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
##  $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining                                : num  0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
##  $ Percent.of.population.of.construction                                                                       : num  6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
##  $ Percent.of.population.of.manufacturing                                                                      : num  4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
##  $ Percent.of.population.of.wholesale.trade                                                                    : num  0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
##  $ Percent.of.population.of.retail.trade                                                                       : num  6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
##  $ Percent.of.population.of.transportation..warehousing..and.utilities                                         : num  5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
##  $ Percent.of.population.of.information                                                                        : num  0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
##  $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing                               : num  5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
##  $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num  14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
##  $ Percent.of.population.of.educational.services..health.care..and.social.assistance                           : num  36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
##  $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services                  : num  9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
##  $ Percent.of.population.of.public.administration                                                              : num  5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
##  $ Percent.of.population.of.other.services                                                                     : num  4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
##  $ Mean.income                                                                                                 : num  58908 68583 54897 83002 96641 ...
# The incident count is read in as text; coerce it to numeric. Entries that
# are not valid numbers become NA (hence the coercion warning below).
crime_data[["The.number.of.crime.incidents"]] <-
  as.numeric(crime_data[["The.number.of.crime.incidents"]])
## Warning: NAs introduced by coercion
# Report which rows failed to convert
print(which(is.na(crime_data[["The.number.of.crime.incidents"]])))
## [1] 800 801
# Keep only the rows that have a usable incident count, then re-check types
crime_data <- subset(crime_data, !is.na(The.number.of.crime.incidents))
str(crime_data)
## 'data.frame':    799 obs. of  39 variables:
##  $ Census.tract                                                                                                : num  101 102 102 103 104 ...
##  $ The.number.of.crime.incidents                                                                               : num  564 452 458 360 225 192 110 110 367 175 ...
##  $ Total.population                                                                                            : int  4189 7083 2502 6213 4730 3831 3690 2228 6513 3946 ...
##  $ Median.age                                                                                                  : num  34.4 32.2 40.2 39.6 25.6 38 28.8 35 35.9 34.1 ...
##  $ White                                                                                                       : int  2073 3198 1099 3429 3427 2330 2123 1577 4087 2565 ...
##  $ Black.or.African.American                                                                                   : int  1687 3545 839 1806 660 1190 595 217 1910 655 ...
##  $ American.Indian.or.Alaska.Native                                                                            : int  0 16 6 5 0 0 0 0 4 0 ...
##  $ Asian                                                                                                       : int  162 148 248 741 395 100 638 326 106 200 ...
##  $ Native.Hawaiian.and.Other.Pacific.Islander                                                                  : int  0 0 18 0 0 0 0 0 0 0 ...
##  $ Some.other.race                                                                                             : int  93 41 208 148 93 17 246 12 87 412 ...
##  $ Two.or.more.races                                                                                           : int  174 135 84 84 155 194 88 96 319 114 ...
##  $ Hispanic.or.Latino                                                                                          : int  456 1571 658 964 454 191 578 374 1715 1654 ...
##  $ Not.Hispani.or.Latino                                                                                       : int  3733 5512 1844 5249 4276 3640 3112 1854 4798 2292 ...
##  $ Total.housing.units                                                                                         : int  2614 2995 1236 3258 2178 2559 1956 1343 3261 1658 ...
##  $ Vacant.housing.units                                                                                        : int  467 489 167 563 341 324 389 253 133 129 ...
##  $ Median.housing.value                                                                                        : num  191600 169300 165700 195000 221700 ...
##  $ Percent.of.less.than.9th.grade                                                                              : num  1.8 11.4 8.1 12.3 2.6 0.6 16.3 13.6 7.7 15.2 ...
##  $ Percent.of.9th.to.12th.grade                                                                                : num  2.6 8.5 9.5 10.5 0.6 6.9 4.3 11 4.5 4 ...
##  $ Percent.of.high.school.graduate                                                                             : num  21.5 22.1 21.4 21.5 13.9 9.8 8.3 22.8 20.1 21.1 ...
##  $ Percent.of.some.college                                                                                     : num  31.8 18.7 20.8 19.9 9.7 29.6 18.7 9 16 16 ...
##  $ Percent.of.associate.s.degree                                                                               : num  3.8 7.3 6.2 3.4 5.1 3.9 4.5 1.9 4.6 2.3 ...
##  $ Percent.of.bachelor.s.degree                                                                                : num  22.3 21.2 19.4 19.5 31.3 28.4 33.5 23.4 28.2 23.7 ...
##  $ Percent.of.graduate.or.professional.degree                                                                  : num  16.2 10.7 14.5 12.8 36.7 20.8 14.5 18.3 18.9 17.7 ...
##  $ Percent.of.employed.population                                                                              : num  63.6 58.2 54.4 56.7 56.5 64.2 58.5 52.5 70.9 68.2 ...
##  $ Percent.of.unemployed.population                                                                            : num  11.3 7.5 6.3 5.2 5.9 6.8 4.5 4.5 5.2 8.6 ...
##  $ Percent.of.population.of.agriculture..forestry..fishing..hunting..and.mining                                : num  0 0 0 1.6 0.6 0 0 0.3 0.4 0 ...
##  $ Percent.of.population.of.construction                                                                       : num  6.2 2.4 0.6 3.8 0.9 4.1 0 2.4 6.7 3.4 ...
##  $ Percent.of.population.of.manufacturing                                                                      : num  4.7 4.7 0.7 9.8 2.9 4.1 3.2 6.4 4.9 5 ...
##  $ Percent.of.population.of.wholesale.trade                                                                    : num  0.8 0.7 1.7 0.4 1.1 1.2 0.6 0 1.5 2 ...
##  $ Percent.of.population.of.retail.trade                                                                       : num  6.2 12.3 11.7 7.8 11 7.6 6.4 12.4 11.7 7.8 ...
##  $ Percent.of.population.of.transportation..warehousing..and.utilities                                         : num  5.8 10.6 2.3 2.1 1.7 4 3 4.5 2.6 3.7 ...
##  $ Percent.of.population.of.information                                                                        : num  0.6 2 4 2 1.6 3.8 3.5 0.8 3.6 3 ...
##  $ Percent.of.population.of.finance..insurance..real.estate..rental..and.leasing                               : num  5.8 4.3 2 5.6 6.2 9.5 6 11.4 7.9 7.7 ...
##  $ Percent.of.population.of.professional..scientific..management..administrative..and.waste.management.services: num  14.1 11.3 8.8 19.6 12 8.7 10.4 9.1 13.2 12.5 ...
##  $ Percent.of.population.of.educational.services..health.care..and.social.assistance                           : num  36.6 31.4 40.1 26.6 34.9 32.1 28.3 27.4 27.4 26.5 ...
##  $ Percent.of.population.of.arts..entertainment..recreation..accommodation..and.food.services                  : num  9.3 7.6 16 12.1 18.3 11.7 28.9 15.4 13.9 18 ...
##  $ Percent.of.population.of.public.administration                                                              : num  5.1 8.3 11.3 6.2 7.4 5.9 6.1 8.1 5.3 9.5 ...
##  $ Percent.of.population.of.other.services                                                                     : num  4.7 4.4 0.8 2.4 1.4 7.3 3.5 1.9 1.1 0.8 ...
##  $ Mean.income                                                                                                 : num  58908 68583 54897 83002 96641 ...
#importing libraries
library(ggplot2)
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Copy of crime_data with two aggregated education shares added.
# The source columns are already percentages, so the aggregates are plain
# sums; they must not be divided by Total.population again.
# "High school graduate or higher" covers all five attainment levels from
# high school graduate up to graduate/professional degree.
data <- crime_data %>%
  mutate(
    Percent.of.less.than.high.school =
      Percent.of.less.than.9th.grade +
      Percent.of.9th.to.12th.grade,
    Percent.of.high.school.graduate.or.higher =
      Percent.of.high.school.graduate +
      Percent.of.some.college +
      Percent.of.associate.s.degree +
      Percent.of.bachelor.s.degree +
      Percent.of.graduate.or.professional.degree
  )
library(ggplot2)
library(plotly)

# Gather the seven education-share columns and the incident count into one
# data frame with shorter column names
education_data <- with(
  crime_data,
  data.frame(
    Less_than_9th = Percent.of.less.than.9th.grade,
    Grade_9th_to_12th = Percent.of.9th.to.12th.grade,
    High_school_graduate = Percent.of.high.school.graduate,
    Some_college = Percent.of.some.college,
    Associate_degree = Percent.of.associate.s.degree,
    Bachelor_degree = Percent.of.bachelor.s.degree,
    Graduate_professional_degree = Percent.of.graduate.or.professional.degree,
    Crime_incidents = The.number.of.crime.incidents
  )
)

# Keep only the rows with fewer than 2000 incidents (the plotted y-axis range)
filtered_data <- education_data[which(education_data$Crime_incidents < 2000), ]

# Display order of the education levels, lowest attainment first
education_order <- c(
  "Less_than_9th",
  "Grade_9th_to_12th",
  "High_school_graduate",
  "Some_college",
  "Associate_degree",
  "Bachelor_degree",
  "Graduate_professional_degree"
)

# Long format for ggplot: one row per (tract, education level), with the
# education level stored as a factor in the order defined above
education_data_long <- filtered_data %>%
  tidyr::pivot_longer(
    cols = !Crime_incidents,
    names_to = "Education_column",
    values_to = "Percent"
  ) %>%
  mutate(Education_column = factor(Education_column, levels = education_order))

# Scatter of incident count against education share, coloured by level
p <- ggplot(
  education_data_long,
  aes(x = Percent, y = Crime_incidents, color = Education_column)
) +
  geom_point(size = 0.5) +
  labs(
    x = "Percent of Education Level",
    y = "Number of Crime Incidents",
    title = "Crime Incidents vs Education Level"
  ) +
  theme_light()

# Turn the static ggplot into an interactive plotly widget
plotly_plot <- ggplotly(p)

# Display the interactive plot
plotly_plot
library(dplyr)
library(plotly)

# Label each tract with a ten-year band of its median age; anything that
# matches none of the four bands is labelled "Other"
crime_data <- crime_data %>%
  mutate(
    age_bin = case_when(
      Median.age >= 20 & Median.age < 30 ~ "20-30",
      Median.age >= 30 & Median.age < 40 ~ "30-40",
      Median.age >= 40 & Median.age < 50 ~ "40-50",
      Median.age >= 50 & Median.age < 60 ~ "50-60",
      TRUE ~ "Other"
    )
  )

# Total number of crime incidents within each age band
crime_counts <- summarise(
  group_by(crime_data, age_bin),
  crime_count = sum(The.number.of.crime.incidents)
)

# Interactive bar chart of the totals
crime_counts %>%
  plot_ly(x = ~age_bin, y = ~crime_count, type = "bar") %>%
  layout(
    title = "Crime Incidents by Age",
    xaxis = list(title = "Age Group"),
    yaxis = list(title = "Count of Crime Incidents")
  )
# Split the tracts into three equal-width bands of median housing value
crime_data$Housing_Level <- cut(
  crime_data$Median.housing.value,
  breaks = 3,
  labels = c("Low", "Medium", "High")
)

# Calculate the 1st and 3rd quartiles of the incident counts
q1 <- quantile(crime_data$The.number.of.crime.incidents, probs = 0.25)
q3 <- quantile(crime_data$The.number.of.crime.incidents, probs = 0.75)

# Calculate the interquartile range (IQR)
iqr <- q3 - q1

# Set the upper and lower bounds for outliers (1.5 * IQR beyond the quartiles)
upper_bound <- q3 + 1.5 * iqr
lower_bound <- q1 - 1.5 * iqr

# Create a filtered version of the data without outliers
filtered_data <- subset(
  crime_data,
  The.number.of.crime.incidents >= lower_bound &
    The.number.of.crime.incidents <= upper_bound
)

# Boxplot of incident counts per housing-value band, outliers excluded.
# The plot is defined once, on the filtered data: an earlier identical plot
# built from the unfiltered data was overwritten before it was ever displayed.
p <- ggplot(
  filtered_data,
  aes(x = Housing_Level, y = The.number.of.crime.incidents, fill = Housing_Level)
) +
  geom_boxplot() +
  labs(
    x = "Housing Value",
    y = "Crime Incidents Level",
    title = "Relationship Between Housing Value and Crime Incidents"
  ) +
  scale_fill_manual(
    values = c("Low" = "lightblue", "Medium" = "lightgreen", "High" = "lightcoral")
  ) +
  theme_minimal()

# Show the plot
print(p)